package au.com.acpfg.misc.muscle; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.StringReader; import java.util.HashMap; import java.util.Set; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataRow; import org.knime.core.data.DataTableSpec; import org.knime.core.data.RowIterator; import org.knime.core.data.collection.ListCell; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.JoinedRow; import org.knime.core.data.def.StringCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.BufferedDataTable; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.ExecutionContext; import org.knime.core.node.ExecutionMonitor; import org.knime.core.node.InvalidSettingsException; import org.knime.core.node.ModelContent; import org.knime.core.node.ModelContentRO; import org.knime.core.node.ModelContentWO; import org.knime.core.node.NodeLogger; import org.knime.core.node.NodeModel; import org.knime.core.node.NodeSettingsRO; import org.knime.core.node.NodeSettingsWO; import org.knime.core.node.defaultnodesettings.SettingsModel; import org.knime.core.node.defaultnodesettings.SettingsModelColumnName; import org.knime.core.node.defaultnodesettings.SettingsModelString; import pal.datatype.AminoAcids; import uk.ac.ebi.webservices.axis1.MuscleClient; import uk.ac.ebi.webservices.axis1.stubs.muscle.InputParameters; /** * This is the model implementation of MuscleAccessor. * Provides multiple alignment data from MUSCLE as implemented by EBI * * @author Andrew Cassin */ public class MuscleAccessorNodeModel extends NodeModel { // the logger instance private static final NodeLogger logger = NodeLogger .getLogger(MuscleAccessorNodeModel.class); /** the settings key which is used to retrieve and store the settings (from the dialog or from a settings file) (package visibility to be usable from the dialog). */ static final String CFGKEY_EMAIL = "email"; static final String CFGKEY_SEQ_COL = "sequence-column"; // list of sequences to align (NB: only one muscle alignment PER ROW) static final String CFGKEY_ACCSN_COL = "accession-column"; // descriptor for sequences static final String CFGKEY_ALIGNMENT_TYPE = "alignment-type"; static final String DEFAULT_EMAIL = "must.set.this@to.use.this.node"; static final String DEFAULT_ALIGNMENT_TYPE = "Protein Sequences"; // example value: the models count variable filled from the dialog // and used in the models execution method. The default components of the // dialog work with "SettingsModels". private final SettingsModelString m_email = make_as_string(CFGKEY_EMAIL); private final SettingsModel m_seq_col = make(CFGKEY_SEQ_COL); private final SettingsModel m_accsn_col = make(CFGKEY_ACCSN_COL); //TODO private final SettingsModel m_alignment_type = make(CFGKEY_ALIGNMENT_TYPE); /* internal model state */ private final HashMap<String,MultiAlignmentCell> m_muscle_map = new HashMap<String,MultiAlignmentCell>(); /** * Constructor for the node model. */ protected MuscleAccessorNodeModel() { super(1, 1); } public static SettingsModel make(String k) { if (k.equals(CFGKEY_EMAIL)) { return new SettingsModelString(CFGKEY_EMAIL, DEFAULT_EMAIL ); } else if (k.equals(CFGKEY_SEQ_COL)) { return new SettingsModelColumnName(CFGKEY_SEQ_COL, "Sequences"); } else if (k.equals(CFGKEY_ACCSN_COL)) { return new SettingsModelColumnName(CFGKEY_ACCSN_COL, "Accessions"); } else if (k.equals(CFGKEY_ALIGNMENT_TYPE)) { return new SettingsModelString(k, DEFAULT_ALIGNMENT_TYPE); } return null; } public static SettingsModelString make_as_string(String k) { return (SettingsModelString) make(k); } /** * {@inheritDoc} */ @Override protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception { if (m_email.equals(DEFAULT_EMAIL)) { throw new Exception("You must set a valid E-Mail for EBI to contact you in the event of problems with the service!"); } int n_rows = inData[0].getRowCount(); int seq_idx = inData[0].getSpec().findColumnIndex(((SettingsModelString)m_seq_col).getStringValue()); int accsn_idx= inData[0].getSpec().findColumnIndex(((SettingsModelString)m_accsn_col).getStringValue()); if (seq_idx < 0 || accsn_idx < 0) { throw new Exception("Cannot find columns... valid data?"); } int done = 0; // create the output columns (raw format for use with R) DataTableSpec outputSpec = new DataTableSpec(inData[0].getDataTableSpec(), make_output_spec()); BufferedDataContainer container = exec.createDataContainer(outputSpec, false, 0); // instantiate MUSCLE client MuscleClient cli = new MuscleClient(); // each row is a separate MUSCLE job, the sequences are in one collection cell, the accessions (IDs) in the other RowIterator it = inData[0].iterator(); while (it.hasNext()) { DataRow r = it.next(); ListCell seqs = (ListCell) r.getCell(seq_idx); ListCell accsns = (ListCell) r.getCell(accsn_idx); if (seqs.size() != accsns.size()) { throw new Exception("Every sequence must have a corresponding accession: error at row "+r.getKey().getString()); } if (seqs.size() < 1) { throw new Exception("Cannot MUSCLE zero sequences: error at row "+r.getKey().getString()); } if (seqs.size() > 1000) { throw new Exception("Too many sequences in row "+r.getKey().getString()); } // dummy a fake "FASTA" file (in memory) and then submit that to MUSCLE@EBI along with other necessary parameters StringBuffer seq_as_fasta = new StringBuffer(); for (int i=0; i<seqs.size(); i++) { seq_as_fasta.append(">"); seq_as_fasta.append(accsns.get(i).toString()); seq_as_fasta.append("\n"); seq_as_fasta.append(seqs.get(i).toString()); seq_as_fasta.append("\n"); } //System.err.println(seq_as_fasta); // lodge the muscle job and store the results in the output table InputParameters ip = new InputParameters(); ip.setSequence(seq_as_fasta.toString()); // start the job String jobId = cli.runApp(m_email.getStringValue(), r.getKey().getString(), ip); exec.checkCanceled(); exec.setProgress(((double)done)/n_rows, "Executing "+jobId); Thread.sleep(20 * 1000); // 20 seconds waitForCompletion(cli, exec, jobId); done++; // process results and add them into the table... // 1. fasta alignment data byte[] bytes = cli.getSrvProxy().getResult(jobId, "aln-fasta", null); DataCell[] cells = new DataCell[3]; cells[0] = new StringCell(jobId); // compute the base64 encoded phylip aligned sequences suitable for use by R's phangorn package String fasta = new String(bytes); String ret = fasta2phylip(fasta); // it must be encoded (I chose base64) as it is common to both Java and R and it must be // encoded due to containing multiple lines, which confuses the CSV passed between KNIME and R String rk = r.getKey().getString(); DataCell mac = AlignmentCellFactory.createCell(fasta, new AminoAcids()); if (mac instanceof MultiAlignmentCell) m_muscle_map.put(rk, (MultiAlignmentCell) mac); cells[1] = mac; bytes = cli.getSrvProxy().getResult(jobId, "out", null); cells[2] = new StringCell("<html><pre>"+new String(bytes)); container.addRowToTable(new JoinedRow(r, new DefaultRow(r.getKey(), cells))); } container.close(); BufferedDataTable out = container.getTable(); return new BufferedDataTable[]{out}; } /** * Ensures an accession from MUSCLE is in a format suitable for phylip. Each accession must be unique to 10chars (sigh) * */ protected String fasta_accession2phylip(String accsn) { if (accsn.length() > 10) { return accsn.substring(0, 10); } else { StringBuffer tmp = new StringBuffer(accsn); int n_spaces = 10 - accsn.length(); while (n_spaces-- > 0) { tmp.append(" "); } return tmp.toString(); } } /** * Returns the list of row ID (as strings) in the muscle map member */ public Set<String> getMuscleMapIDs() { Set<String> ret = m_muscle_map.keySet(); return m_muscle_map.keySet(); } /** * Returns the alignment cell which corresponds to the specified rowkey or <tt>null</tt> if the key does not exist */ public MultiAlignmentCell getAlignment(String rk) { return m_muscle_map.get(rk); } public String getFormattedAlignment(String rk) { return getFormattedAlignment(rk, FormattedRenderer.FormatType.F_CLUSTALW); } public String getFormattedAlignment(String rk, FormattedRenderer.FormatType format) { MultiAlignmentCell cell = getAlignment(rk); return cell.getFormattedAlignment(format); } /** * Converts the fasta sequences into phylip format: * <number of sequences> <length of aligned sequences> * <id - exactly ten-space character padded><sequence for id> * ... * * @param fasta_aligned_sequences * @return the phylip formatted result */ protected String fasta2phylip(String fasta_aligned_sequences) throws IOException { StringBuffer seq = new StringBuffer(); int count = 0; int length = 0; boolean prev = false; BufferedReader rdr = new BufferedReader(new StringReader(fasta_aligned_sequences)); String line; String cur_id = null; StringBuffer tmp = new StringBuffer(); while ((line = rdr.readLine()) != null) { //System.err.println(cur_id+" "+line); if (line.startsWith(">")) { count++; if (prev) { seq.append(fasta_accession2phylip(cur_id)+tmp+"\n"); cur_id = line.trim().substring(1); length = tmp.length(); tmp = new StringBuffer(); } else { prev = true; cur_id = line.trim().substring(1); } } else { tmp.append(line.trim()); } } // dont forget the last sequence if (tmp.length() > 0) { seq.append(fasta_accession2phylip(cur_id)+tmp+"\n"); } return " "+count+" "+length+"\n"+seq.toString(); } protected void waitForCompletion(MuscleClient cli, ExecutionContext exec, String jobId) throws Exception { if (jobId.length() > 0) { int check_period = 20 * 1000; // every 10s String status = "PENDING"; int retry = 0; while (status.equals("PENDING") || status.equals("RUNNING")) { try { logger.info("Waiting for "+jobId); status = cli.checkStatus(jobId); if (status.equals("RUNNING") || status.equals("PENDING")) { logger.info(jobId + " " + status + ", sleeping for "+check_period+ " milliseconds"); // check ten times each check_period to see if the user pressed cancel for (int i=0; i<10; i++) { Thread.sleep(check_period / 10); exec.checkCanceled(); } // each time job is still going, we double check_period to reduce likelihood of overloading EBI check_period *= 2; if (check_period > 200000) { check_period = 200000; } } if (status == "FAILED") { logger.error("MUSCLE job failed: "+jobId); } } catch (IOException e) { if (retry < 3) { logger.warn("Unable to check job "+jobId+" retrying (after linear-backoff delay)... "); Thread.sleep(((420 * retry) + 120)* 1000); status = "PENDING"; retry++; } else { throw new Exception("Cannot check job "+jobId+" via MUSCLE (EBI)... aborting"+e); } } } } else { throw new Exception("Bogus EBI job id... aborting!"); } } /** * {@inheritDoc} */ @Override protected void reset() { // TODO Code executed on reset. // Models build during execute are cleared here. // Also data handled in load/saveInternals will be erased here. } protected DataTableSpec make_output_spec() { DataColumnSpec[] cols = new DataColumnSpec[3]; cols[0] = new DataColumnSpecCreator("MUSCLE@EBI JobID", StringCell.TYPE).createSpec(); cols[1] = new DataColumnSpecCreator("MUSCLE Aligned Sequences", MultiAlignmentCell.TYPE).createSpec(); cols[2] = new DataColumnSpecCreator("MUSCLE output", StringCell.TYPE).createSpec(); return new DataTableSpec(cols); } /** * {@inheritDoc} */ @Override protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException { return new DataTableSpec[] { new DataTableSpec(inSpecs[0], make_output_spec()) }; } /** * {@inheritDoc} */ @Override protected void saveSettingsTo(final NodeSettingsWO settings) { m_email.saveSettingsTo(settings); m_seq_col.saveSettingsTo(settings); m_accsn_col.saveSettingsTo(settings); m_alignment_type.saveSettingsTo(settings); } /** * {@inheritDoc} */ @Override protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException { m_email.loadSettingsFrom(settings); m_seq_col.loadSettingsFrom(settings); m_accsn_col.loadSettingsFrom(settings); m_alignment_type.loadSettingsFrom(settings); } /** * {@inheritDoc} */ @Override protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException { m_email.validateSettings(settings); m_seq_col.validateSettings(settings); m_accsn_col.validateSettings(settings); m_alignment_type.validateSettings(settings); } /** * {@inheritDoc} */ @Override protected void loadInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { File file = new File(internDir, "muscle-internals.xml"); FileInputStream fis = new FileInputStream(file); ModelContentRO modelContent = ModelContent.loadFromXML(fis); try { String[] keys = modelContent.getStringArray("internal-muscle-map-keys"); m_muscle_map.clear(); ModelContentRO subkey = modelContent.getModelContent("internal-muscle-map"); for (String key : keys) { DataCell dc = subkey.getDataCell(key); if (dc instanceof MultiAlignmentCell) { m_muscle_map.put(key, (MultiAlignmentCell) dc); } } fis.close(); } catch (InvalidSettingsException e) { throw new IOException(e.getMessage()); } } /** * {@inheritDoc} */ @Override protected void saveInternals(final File internDir, final ExecutionMonitor exec) throws IOException, CanceledExecutionException { // save m_muscle_map... ModelContent modelContent = new ModelContent("muscle-internals.model"); String[] keys = m_muscle_map.keySet().toArray(new String[0]); modelContent.addStringArray("internal-muscle-map-keys", keys); ModelContentWO subkey = modelContent.addModelContent("internal-muscle-map"); for (String key : keys) { subkey.addDataCell(key, m_muscle_map.get(key)); } // create the XML file alongside the rest of the node data (same folder) File file = new File(internDir, "muscle-internals.xml"); FileOutputStream fos = new FileOutputStream(file); modelContent.saveToXML(fos); } }